Spatial Analytics Code Challenge

import pandas as pd
import numpy as np
import json
import os, sys
import matplotlib.pyplot as plt
import datetime
from collections import Counter, defaultdict
import pickle
import warnings
warnings.filterwarnings("ignore")
# ML algorithms
from sklearn import metrics
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.preprocessing import scale
%matplotlib inline
%autosave 180
plt.style.use('seaborn-whitegrid')
# Setting up global variables for folder paths to read and save files
# Make sure to modify the path! Otherwise, this notebook may not reproduce the same result
blackstone_folder_path = "/Users/soohyunlee/Desktop/python/blackstone/chlor_a"
pickle_folder_path = "/Users/soohyunlee/Desktop/python/blackstone/solution/pickles"
results_folder_path = "/Users/soohyunlee/Desktop/python/blackstone/solution/results"
**Key facts about the data and data preprocessing decisions:**
# Data Evaluation Example
# Verify that every file in the data folder parses to the expected grid shape.
entries = os.listdir(blackstone_folder_path)
corr_file_num = 0
for entry in entries:
    frame = pd.read_csv("/".join([blackstone_folder_path, entry]))
    if frame.shape == (720, 1201):
        corr_file_num += 1
    else:
        # Report any file whose grid differs from the expected 720 x 1201
        print (entry)
print ("Checked {} files and {} have the same shape".format(len(entries), corr_file_num))
# Running for the first time
def run_init():
    """
    Read every CSV in the data folder and group the parsed frames by year.

    The file name encodes the observation date: characters 8-12 hold the
    year and characters 12-15 the day-of-year. Returns a defaultdict mapping
    year -> list of dataframes in long (longitude, latitude, value) form.
    """
    data_collection = defaultdict(list)
    for fname in os.listdir(blackstone_folder_path):
        if not fname.endswith("csv"):
            continue
        year, doy = int(fname[8:12]), int(fname[12:15])
        # Day-of-year is 1-based, hence the (doy - 1) offset.
        record_date = datetime.datetime(year, 1, 1) + datetime.timedelta(doy - 1)
        frame = pd.read_csv("/".join([blackstone_folder_path, fname]), index_col=0)
        # Wide grid -> long format: one row per (longitude, latitude) cell.
        frame = frame.unstack().reset_index()
        frame.rename(columns={"level_0": "longitude", "level_1": "latitude", 0: "chlor_a_val"},
                     inplace=True)
        # Attach the date attributes derived from the file name.
        frame["year"] = record_date.year
        frame["month"] = record_date.month
        frame["day"] = record_date.day
        frame["doy"] = doy
        data_collection[record_date.year].append(frame)
    return data_collection
# Concatenating the dataframes
def create_data_dict_raw(data_collection,
                         lat_min=None, lat_max=None, long_min=None, long_max=None):
    """
    Concatenate each year's list of frames into one dataframe and restrict
    it to a geographic window.

    Parameters
    ----------
    data_collection : dict
        Mapping of year -> list of dataframes (as built by run_init).
    lat_min, lat_max, long_min, long_max : float, optional
        Bounding box; when all four are given, rows are kept for
        long_min <= longitude <= long_max and lat_min <= latitude < lat_max.
        Otherwise the default window of -50 <= longitude < 0 E is applied
        (this drops ~50% of the data, speeding up the computation).

    Returns
    -------
    dict mapping year -> filtered dataframe.
    """
    data_dict_raw = {}
    for year, data in data_collection.items():
        df_temp = pd.concat(data).reset_index(drop=True)
        # "longitude" is recorded as a string, so convert it to float first.
        df_temp["longitude"] = df_temp["longitude"].astype(float)
        # BUG FIX: the previous truthiness test (lat_min and lat_max and ...)
        # silently fell through to the default window whenever a bound was 0
        # (e.g. long_max=0 or lat_min=0); compare against None explicitly.
        if all(bound is not None for bound in (lat_min, lat_max, long_min, long_max)):
            df_temp = df_temp[(df_temp.longitude <= long_max) &
                              (df_temp.longitude >= long_min) &
                              (df_temp.latitude < lat_max) &
                              (df_temp.latitude >= lat_min)]
        else:
            # Default: keep longitudes between -50 and 0 degrees E per the instructions.
            df_temp = df_temp[(df_temp.longitude < 0) & (df_temp.longitude >= -50)]
        data_dict_raw[year] = df_temp
    return data_dict_raw
# For dictionary and defaultdict objects
# Saving to a pickle file under pickle_folder_path
def save_to_pickle(data_object, file_name):
    """Serialize data_object to <pickle_folder_path>/<file_name>.pickle."""
    target = "/".join([pickle_folder_path, file_name + ".pickle"])
    with open(target, "wb") as handle:
        pickle.dump(data_object, handle, protocol=pickle.HIGHEST_PROTOCOL)
# Loading a previously saved pickle object
def read_from_pickle(file_name):
    """Deserialize and return <pickle_folder_path>/<file_name>.pickle."""
    source = "/".join([pickle_folder_path, file_name + ".pickle"])
    with open(source, "rb") as handle:
        return pickle.load(handle)
# Build the raw collections once, persist them, then reload from disk so
# later sessions can skip the expensive parsing step.
data_collection = run_init()
save_to_pickle(data_object=data_collection, file_name="data_collection")
data_dict_raw = create_data_dict_raw(data_collection=data_collection)
save_to_pickle(data_object=data_dict_raw, file_name="data_dict_raw")
data_collection = read_from_pickle("data_collection")
data_dict_raw = read_from_pickle("data_dict_raw")
# Annual Average
def get_annual_chlor_avg(print_output=False):
    """
    Plot the annual mean chlorophyll-a concentration.

    Reads the module-level data_dict_raw (year -> dataframe).

    BUG FIX: the x-axis was hard-coded to range(2002, 2020), which crashes
    (length mismatch) or mislabels the plot if the year set ever differs;
    derive the axis from the dictionary keys instead.
    """
    years, avg_chlor = [], []
    for year in sorted(data_dict_raw):
        annual_avg = data_dict_raw[year].chlor_a_val.mean()
        years.append(year)
        avg_chlor.append(annual_avg)
        if print_output:
            print ("Year: {}, Avg Chlorophyll-a concentration: {}".format(year, annual_avg))
    fig, ax = plt.subplots(1, 1, figsize=(10, 6))
    ax.plot(years, avg_chlor, lw=2)
    ax.grid(alpha=0.5)
    ax.set_title("Historical Annual Chlorophyll-a Concentration in the North Atlantic Ocean")
    ax.set_xlabel("Year")
    ax.set_ylabel("Avg Chlorophyll-a Concentration (mg/m^-3)")
    ax.set_xlim(min(years), max(years))
    ax.set_xticks(np.arange(min(years), max(years) + 1, 1));
get_annual_chlor_avg(print_output=False)
**Comment:**
The plot above illustrates an upward trend over time between 2002 and 2019. It would be worthwhile to investigate the effect of detrending for any further analysis, as its significance may be a good indicator of climatic (long-term) changes. Also, note that the years 2002 and 2019 have different numbers of data points (2002: July-Dec, 2019: Jan-May). This may have dragged the graph relatively downward and upward at those ends.
# Monthly Average within a particular year
def get_monthly_avg(data_dict_raw=data_dict_raw, year=2003, print_output=False):
    """
    Plot the monthly mean chlorophyll-a concentration for one year.

    NOTE: the data_dict_raw default is bound to the module-level dict at
    definition time (original behavior, kept for compatibility).

    BUG FIX: the x-axis was hard-coded to range(1, 13), which crashes with a
    length mismatch for partial years (2002 has Jul-Dec only, 2019 Jan-May);
    plot the months actually present in the data instead.
    """
    months, avg_chlor = [], []
    if print_output:
        print("- For Year: {}".format(year), "\n")
    for month, df in data_dict_raw[year].groupby("month"):
        monthly_avg = df.chlor_a_val.mean()
        months.append(month)
        avg_chlor.append(monthly_avg)
        if print_output:
            print ("Month: {}, Avg Chlorophyll-a concentration: {}".format(month, monthly_avg))
    fig, ax = plt.subplots(1, 1, figsize=(10, 6))
    ax.plot(months, avg_chlor, lw=2)
    ax.grid(alpha=0.5)
    ax.set_title("Monthly Chlorophyll-a Concentration in the North Atlantic Ocean for {}".format(year))
    ax.set_xlabel("Month")
    ax.set_ylabel("Avg Chlorophyll-a Concentration (mg/m^-3)")
    ax.set_xlim(min(months), max(months))
    ax.set_xticks(np.arange(1, 13, 1));
# Example
get_monthly_avg(year=2003, print_output=False)
**Comment:**
It appears that the chlorophyll-⍺ is highest in early summer (May-June) and the lowest in the winter (Nov-Jan). Such a short-term variation could be caused by the changes in cloud cover, which can reduce or increase solar energy available for photosynthesis.
# One line per year showing the percentage of missing chlorophyll values by
# month, shaded light-to-dark blue in iteration (chronological) order.
fig, ax = plt.subplots(1, 1, figsize=(15, 8))
cmap = plt.get_cmap("Blues")
colors = cmap(np.linspace(0, 1, num=18))
for num, year in enumerate(data_dict_raw):
    x_months = []
    y_values = []
    for month, series in data_dict_raw[year].groupby("month")["chlor_a_val"]:
        x_months.append(month)
        # Percentage of null cells among this month's grid points.
        y_values.append(series.isnull().sum() / len(series) * 100)
    ax.plot(x_months, y_values, label=year, lw=1, c=colors[num])
ax.grid(alpha=0.5)
ax.legend(bbox_to_anchor=(1.1, 1))
ax.set_title("Historical Percentage of Missing Data by Month")
ax.set_xlabel("Month")
ax.set_ylabel("Percentage of Missing Data (%)")
ax.set_xlim(1, 12)
ax.set_ylim(20, 60)
ax.set_xticks(np.arange(1, 13, 1));
**Comment:**
On average, the amount of missing data is lowest in September and highest in December. Assuming that the landmasses remain the same, the abrupt increase in missing values could be caused by the amount of cloud cover or by large areas of sea ice at high latitudes in the winter. For this reason, data points from September were used to plot the land boundary in the plot below. However, using a land mask, such as NLCD (National Land Cover Data), would be a more appropriate and accurate choice to mask out the land area.
def evaluate_geographic_bias(year=2003, start_month=1, end_month=12):
    """
    Scatter the locations of missing values for each month of a year,
    shaded light-to-dark blue, with September's pattern overlaid in yellow
    as a proxy for the landmass (September has the fewest missing values).
    """
    month_span = end_month - start_month + 1
    palette = plt.get_cmap("Blues")(np.linspace(0, 1, num=month_span))
    fig, ax = plt.subplots(1, 1, figsize=(20, 16))
    missing = data_dict_raw[year]
    missing = missing[missing.isnull().any(axis=1)]
    # for month, df in missing.groupby("month"): # more efficient, if plotting for all 12 months
    for num, month in enumerate(range(start_month, end_month + 1)):
        subset = missing[missing.month == month]
        ax.scatter(x=subset.longitude, y=subset.latitude, alpha=0.3,
                   color=palette[num], label=month)
    # Plot September as it has the least amount of missing values.
    # Most likely to show the most accurate landmass.
    september = missing[missing.month == 9]
    ax.scatter(x=september.longitude, y=september.latitude,
               alpha=0.3, color='yellow', label="Land")
    ax.set_title("Amount of Missing Data for {}: {} - {}".format(year, start_month, end_month))
    ax.set_xlim(missing.longitude.min(), missing.longitude.max())
    ax.set_ylim(missing.latitude.min(), missing.latitude.max())
    ax.set_xlabel("Longitude")
    ax.set_ylabel("Latitude")
    ax.legend(bbox_to_anchor=(1.1, 1));
    fig.savefig("/".join([results_folder_path,
                          str(year) + "_" + str(start_month) + "_" + str(end_month) + ".png"]))
evaluate_geographic_bias(year=2003, start_month=1, end_month=10)
**• If the image below is not showing, please check the folder & directory!**

**Comment:**
As the images above show, there is significant geographic variability in the amount of missing data. The high-latitude area (40-70°N) tends to have far more missing data than the central North Atlantic Ocean, implying that no record was collected there. Another important observation is that there is less missing data near the mainland along its border and around ~30°N.
# Region 1 (r_1): [45 – 50° N, -25 – -20° E]
# Region 2 (r_2): [20 – 25° N, -30 – -25° E]
# Run this when runtime = 0
# BUG FIX: create_data_dict_raw requires the data_collection argument, and
# save_to_pickle's keyword is file_name (not filename); both of the original
# calls raised TypeError as written.
r_1 = create_data_dict_raw(data_collection, lat_min=45, lat_max=50, long_min=-25, long_max=-20)
r_2 = create_data_dict_raw(data_collection, lat_min=20, lat_max=25, long_min=-30, long_max=-25)
# Saving to the pickle objects
save_to_pickle(data_object=r_1, file_name="r_1")
save_to_pickle(data_object=r_2, file_name="r_2")
# Loading from the pickle objects after the initial setup
r_1 = read_from_pickle("r_1")
r_2 = read_from_pickle("r_2")
def compare_two_regions(year=2003, start_month=1, end_month=12):
    """
    Side-by-side scatter plots of chlorophyll-a for region 1 and region 2
    over the given month range of one year, colored by concentration.
    """
    print ("- Comparing the two regions for {} between {} and {}".format(year, start_month, end_month))
    fig, axes = plt.subplots(1, 2, figsize=(20, 8))
    axes = axes.flatten()
    for idx, region in enumerate((r_1, r_2)):
        ax = axes[idx]
        df_temp = region[year]
        for month in range(start_month, end_month + 1):
            subset = df_temp[df_temp.month == month]
            im = ax.scatter(x=subset.longitude,
                            y=subset.latitude,
                            alpha=0.5,
                            c=subset.chlor_a_val,
                            cmap="viridis")
        ax.set_title("Region {}".format(idx + 1))
        ax.set_xlim(df_temp.longitude.min(), df_temp.longitude.max())
        ax.set_ylim(df_temp.latitude.min(), df_temp.latitude.max())
        ax.set_xlabel("Longitude")
        ax.set_ylabel("Latitude")
        # One shared colorbar per axis, based on the last scatter handle.
        fig.colorbar(im, ax=ax);
    fig.savefig("/".join([results_folder_path,
                          "compare_regions_" + str(year) + "_" + str(start_month) + "_" + str(end_month) + ".png"]))
compare_two_regions(year=2005, start_month=10, end_month=12)
## This is very hacky! limit the y axis to (0,10)
def compare_two_regions_by_timeseries(year):
    """
    Plot monthly min/mean/max chlorophyll-a for region 1 and region 2 side
    by side for one year, with the y-axis clipped to (0, 10).

    BUG FIX: the original dict-of-dict .agg() renaming spec was deprecated
    in pandas 0.20 and removed in 1.0 (raises SpecificationError); aggregate
    with a list and rename the columns afterwards, preserving the original
    column order (min, mean, max).
    """
    plt.figure(figsize=(15, 7))
    plt.subplot(121)
    stats_1 = r_1[year].groupby("month")["chlor_a_val"].agg(["min", "mean", "max"])
    stats_1.columns = ["r_1_min", "r_1_mean", "r_1_max"]
    plt.plot(stats_1)
    plt.title("Region 1")
    plt.ylim(0, 10)
    plt.subplot(122)
    stats_2 = r_2[year].groupby("month")["chlor_a_val"].agg(["min", "mean", "max"])
    stats_2.columns = ["r_2_min", "r_2_mean", "r_2_max"]
    plt.plot(stats_2)
    plt.title("Region 2")
    plt.ylim(0, 10);
for x in range(2002, 2020):
    compare_two_regions_by_timeseries(year=x)
## This is very hacky! limit the y axis to (0,1)
def compare_two_regions_by_timeseries(year):
    """
    Plot monthly min/mean/max chlorophyll-a for region 1 and region 2 side
    by side for one year, with the y-axis clipped to (0, 1) to zoom in on
    region 2's scale. (Redefines the (0, 10) version from the cell above.)

    BUG FIX: the original dict-of-dict .agg() renaming spec was deprecated
    in pandas 0.20 and removed in 1.0 (raises SpecificationError); aggregate
    with a list and rename the columns afterwards, preserving the original
    column order (min, mean, max).
    """
    plt.figure(figsize=(15, 7))
    plt.subplot(121)
    stats_1 = r_1[year].groupby("month")["chlor_a_val"].agg(["min", "mean", "max"])
    stats_1.columns = ["r_1_min", "r_1_mean", "r_1_max"]
    plt.plot(stats_1)
    plt.title("Region 1")
    plt.ylim(0, 1)
    plt.subplot(122)
    stats_2 = r_2[year].groupby("month")["chlor_a_val"].agg(["min", "mean", "max"])
    stats_2.columns = ["r_2_min", "r_2_mean", "r_2_max"]
    plt.plot(stats_2)
    plt.title("Region 2")
    plt.ylim(0, 1);
for x in range(2002, 2020):
    compare_two_regions_by_timeseries(year=x)
**Comment:**
In the graphs, the green, orange, and blue lines show the max, mean, and min values of the chlorophyll-a concentration for the year of interest. The graphs make it evident that region 1 has far greater interannual variability: its range is as large as 10, whereas region 2 remains fairly steady below 1. In addition, although no conspicuous seasonality was observed in region 2, region 1 tends to show a weak seasonality, with an abrupt increase in concentration around June every year.
def modify_column(x):
    """Relabel chlor_a_val as a "year_month" column and keep only the
    longitude/latitude keys plus that value column.

    Note: also adds the new column to x in place (original behavior).
    """
    year_part = str(x.year.loc[0])
    month_part = str(x.month.loc[0])
    label = "_".join([year_part, month_part])
    x[label] = x.chlor_a_val
    return x[["longitude", "latitude", label]]
def merge_data(temp_df, expected_rows=864000):
    """Inner-merge a list of dataframes on (longitude, latitude).

    Parameters
    ----------
    temp_df : list of pd.DataFrame
        Frames sharing "longitude" and "latitude" columns.
    expected_rows : int, optional
        Expected row count of the merged result. Defaults to 864000, the
        full 720 x 1200 grid used by this notebook; previously this was
        hard-coded, which made the function unusable on regional subsets.

    Returns
    -------
    pd.DataFrame with the key columns plus one value column per input frame.
    """
    assert len(temp_df) > 0, "There is no data to merge on"
    merged = temp_df[0]
    for frame in temp_df[1:]:
        merged = pd.merge(merged, frame,
                          on=["longitude", "latitude"],
                          how="inner")
    # An inner merge silently drops rows absent from any frame, so a short
    # result signals missing coordinates. (Typo fix: "information lost".)
    assert merged.shape[0] == expected_rows, "There could be some information lost due to missing values"
    return merged
def get_df_mg_dict(data_collection):
    """For each year, reshape every frame via modify_column and inner-merge
    them on (longitude, latitude) into one wide frame per year."""
    df_mg_d = {}
    for year in data_collection:
        print ("Processing year: {}".format(year))
        reshaped = [modify_column(frame) for frame in data_collection[year]]
        df_mg_d[year] = merge_data(reshaped)
    return df_mg_d
# BUG FIX: the original call referenced "data_collection_raw", which is not
# defined anywhere in this notebook (NameError); the per-year frame lists
# live in data_collection.
data_collection_merged = get_df_mg_dict(data_collection=data_collection)
save_to_pickle(data_object=data_collection_merged, file_name="data_collection_merged")
**Comment:**
Another dataframe for the historical data was prepared. Its variable name is "df_complete", and it is used to perform K-means clustering.
# Preparing and saving the data into csv
# Merge all yearly wide frames into one historical frame keyed on (longitude, latitude).
df_complete = merge_data(list(data_collection_merged.values()))
df_complete.to_csv(("/").join([pickle_folder_path, "historical_data.csv"]), index=False)
df_complete.head()
# Reading the data
df_complete = pd.read_csv(("/").join([pickle_folder_path, "historical_data.csv"]))
# Per-row data coverage: count missing cells across all columns of the row.
test = df_complete.copy()
test["num_missing"] = test.isnull().sum(axis=1)
# NOTE(review): the denominator df_complete.shape[1] includes the longitude
# and latitude key columns, so the percentage slightly understates the share
# of missing *measurement* columns — confirm this is intended.
test["num_missing_percentage"] = test.num_missing/df_complete.shape[1]*100
test["covered_percentage"] = 100.0 - test.num_missing_percentage
test.head(2)
# Data summary 1: coverage distribution over all grid points
test.covered_percentage.describe()
# Data summary 2: coverage distribution excluding nearly-empty rows (<=10% covered)
test[test["covered_percentage"]>10].covered_percentage.describe()
**Comment:**
Although the 80% threshold and the value of K for the K-means clustering algorithm are arbitrarily chosen due to the time constraint, in a real analysis they should be carefully chosen through cross-validation and the "elbow" method.
# Drop data whose data availability is below 80 percent to speed up computing process
test_bfill = test[test.covered_percentage > 80].fillna(method="bfill")
test_bfill.info()
# Coordinates (unused below, kept for reference) and the feature matrix:
# skip longitude/latitude at the front and the three coverage bookkeeping
# columns (num_missing, num_missing_percentage, covered_percentage) at the end.
y = np.array(test_bfill[["longitude", "latitude"]])
X = np.array(test_bfill.iloc[:,2:-3])
# REPRODUCIBILITY FIX: without random_state, KMeans seeds its centroids
# randomly and every run produced different cluster labels, contradicting
# the reproducibility note at the top of this notebook. n_init is pinned to
# its historical default of 10 to stay stable across sklearn versions.
kmeans = KMeans(n_clusters=5, init='k-means++', n_init=10, random_state=0)
kmeans.fit(X)
test_bfill["cluster"] = kmeans.labels_
test_bfill.head()
# Save the data into csv
test_bfill.to_csv(("/").join([pickle_folder_path, "cluster_result.csv"]), index=False)
# Reading the data
test_bfill = pd.read_csv(("/").join([pickle_folder_path, "cluster_result.csv"]))
def plot_clusters(df=test_bfill, limit_lat_long=False, with_land=False):
    """
    Scatter the K-means cluster assignments on a longitude/latitude map.

    Parameters
    ----------
    df : pd.DataFrame
        Cluster results with longitude, latitude and cluster columns.
        Defaults to the module-level test_bfill (bound at definition time).
    limit_lat_long : bool
        Restrict the plot to the -50 to 0 degrees E window.
    with_land : bool
        Overlay September-2003 missing-data points in grey as a landmass proxy.
    """
    cmap = plt.get_cmap("viridis")
    colors = cmap(np.linspace(0, 1, num=5))
    fig, ax = plt.subplots(1, 1, figsize=(30, 20))  # adjust if plotting the full image!
    # Work on a copy so the caller's (global) frame is not mutated in place.
    df = df.copy()
    df.longitude = df.longitude.astype(float)
    if limit_lat_long:
        df = df[(df.longitude < 0) & (df.longitude >= -50)]
    # BUG FIX: the original reused the name "df" as the groupby loop variable,
    # so the axis limits below were computed from the *last* cluster only.
    for num, (cluster, group) in enumerate(df.groupby("cluster")):
        ax.scatter(x=group.longitude,
                   y=group.latitude,
                   alpha=0.5,
                   color=colors[num],
                   label="cluster_" + str(cluster))
    if with_land:
        # September has the least missing data, so its missing-value pattern
        # traces the landmass most accurately. (Also renamed the local that
        # previously shadowed the global "test" frame.)
        land = data_dict_raw[2003]  # 2003 arbitrarily chosen
        land = land[land.isnull().any(axis=1)]
        september = land[land.month == 9]
        ax.scatter(x=september.longitude, y=september.latitude,
                   alpha=0.05, color='#E8E8E8')
    ax.set_title("Cluster plot")
    ax.set_xlim(df.longitude.min(), df.longitude.max())
    ax.set_ylim(df.latitude.min(), df.latitude.max())
    ax.set_xlabel("Longitude")
    ax.set_ylabel("Latitude")
    ax.legend(bbox_to_anchor=(1.1, 1));
    fig.savefig("/".join([results_folder_path, "k_5_clustering_results_lat_long_limited.png"]))
plot_clusters(limit_lat_long=True, with_land=True)
**Comments:**
Quick Answer: Based on the graph below, it is beneficial to deploy to the region in cluster_1, which on average has a higher chlorophyll concentration earlier in the season than the other clusters.
Alternative Option: Assuming that the concentration of chlorophyll-a is a good indicator of the amount of plankton, find the density plot for each (lat, long) tuple. If the feature has a normal distribution, compute the individual mean and standard deviation. Determine a threshold value (ε) above which a point will be classified as an anomaly. Depending on the number of deviations away from the mean, we could distinguish which (lat, long) coordinates tend to have an abnormally higher concentration of chlorophyll-a earlier in the season than others.
Although we only have chlorophyll-a concentration data, it would be extremely helpful to have other features that could help explain and substantiate the anomalies, such as sea surface temperature, wind speed/direction, and so on.
# df for part 2
df_2 = test_bfill.reset_index(drop=True)
# Keep the monthly value columns (positions 2:-4) plus the trailing
# "cluster" column; drop longitude/latitude and the coverage bookkeeping.
keep = df_2.columns[2:-4].tolist() + [df_2.columns[-1]]
df_2 = df_2[keep]
df_2.groupby("cluster").agg("mean")
# Save the file to plot in Excel - memory issue. Can be resolved by using an AWS EC2 host.
# df_2.groupby("cluster").agg("mean").reset_index().to_csv(("/").join([pickle_folder_path, "cluster_agg.csv"]), index=False)
**• If the image below is not showing, please check the folder & directory!**
